Much of the data exploration here was inspired by Michael Griffiths: https://www.kaggle.com/msjgriffiths/exploratory-analysis/code
# Load data
# The anime and genre tables come from the clean/ directory; their text columns
# are kept as plain character vectors.
animes <- read.csv(
  file = "../data/clean/animes.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
genres <- read.csv(
  file = "../data/clean/genres.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# The ratings table comes from the raw/ directory and is read with the
# default string handling.
ratings <- read.csv(file = "../data/raw/no_null_ratings.csv", header = TRUE)
# Bar chart of the N highest-rated anime.
N <- 10 # number of top-rated titles to display

# Rank every title by rating, best first. order() places NA ratings last by
# default, so unrated titles never reach the top of the list.
df <- data.frame(
  Anime = animes$name,
  Rating = animes$rating,
  stringsAsFactors = FALSE
)
df <- df[order(df$Rating, decreasing = TRUE), ]

# Keep the top N. Unlike df[1:N, ], head() does not pad the result with
# all-NA rows when fewer than N titles are available.
df <- head(df, N)

# Pin the factor levels to the ranked order so ggplot keeps the bars sorted by
# rating instead of alphabetically. unique() guards against duplicate titles,
# which factor() would reject as duplicated levels.
df$Anime <- factor(df$Anime, levels = unique(df$Anime))

df %>%
  ggplot(aes(x = Anime, y = Rating)) +
  geom_bar(stat = "identity", width = .5, fill = "tomato3") +
  # Zoom the y axis to 9-10 so differences between top titles are visible.
  coord_cartesian(ylim = c(9.0, 10.0)) +
  labs(
    title = "Top Anime Ratings",
    caption = "source: MAL dataset"
  ) +
  # Slant the title labels so long names do not overlap.
  theme(axis.text.x = element_text(angle = 65, vjust = 0.6))
# Overlaid density curves of anime ratings, one curve per `type`.
# Rows with a missing rating are dropped by stat_density, which emits a
# "Removed ... rows containing non-finite values" warning.
animes %>%
  ggplot(aes(x = rating, group = type)) +
  geom_density(aes(fill = type), alpha = .4) +
  xlim(0, 10) +
  # The plot shows whole distributions, not averages, so the title says so.
  labs(title = "Distribution of Anime Ratings by Type")
## Warning: Removed 230 rows containing non-finite values (stat_density).
# Per-type summary table: mean and standard deviation of the `members` column
# (reported as viewers) and of `rating`.
animes %>%
  group_by(type) %>%
  summarise(
    # No na.rm here: a missing `members` value would surface as NA in the
    # table instead of being dropped silently.
    average.viewers = mean(members),
    sd.viewers = sd(members),
    # Ratings contain NAs, so they are removed explicitly. TRUE is spelled out
    # because T is an ordinary variable that can be reassigned.
    average.rating = mean(rating, na.rm = TRUE),
    sd.rating = sd(rating, na.rm = TRUE)
  ) %>%
  formattable()
| type | average.viewers | sd.viewers | average.rating | sd.rating |
|---|---|---|---|---|
|  | 6537.400 | 13278.495 | NaN | NA |
| Movie | 10369.094 | 30898.076 | 6.318414 | 1.2119725 |
| Music | 1311.840 | 4548.136 | 5.588996 | 0.9584401 |
| ONA | 4114.030 | 12399.959 | 5.643298 | 1.1270907 |
| OVA | 5986.140 | 15026.128 | 6.375221 | 0.8583584 |
| Special | 7676.061 | 15546.290 | 6.523501 | 0.8877620 |
| TV | 42683.658 | 89121.009 | 6.902299 | 0.8635256 |
# Join each anime's rating onto the genres table by anime_id, then draw one
# rating density curve per genre as an interactive plotly figure.
animes_subset_ratings <- select(animes, "anime_id", "rating")
genres_with_ratings <- genres %>%
  inner_join(animes_subset_ratings, by = "anime_id")

g <- ggplot(genres_with_ratings, aes(x = rating, group = genre)) +
  geom_density(aes(fill = genre), alpha = .4)
ggplotly(g)
## Warning: Removed 690 rows containing non-finite values (stat_density).
# Interactive bar chart: number of rows in `ratings` for each distinct rating
# value. The rating is treated as a discrete category on the x axis.
g_bar <- ggplot(ratings, aes(x = factor(rating))) +
  geom_bar()
ggplotly(g_bar)
# Violin plot of all ratings with a narrow box plot drawn on top.
# The constant factor(0) is a dummy x value that puts every row in one group,
# and the empty x label hides that dummy axis title.
g_violin <- ggplot(ratings, aes(x = factor(0), y = rating)) +
  geom_violin(trim = FALSE, adjust = 2) +
  geom_boxplot(width = 0.1) +
  xlab("")
ggplotly(g_violin)